rm(list = ls())

library("tidystm")
library(quanteda)
library(stm)
library(tidyverse)
library(tidytext)
library(furrr)
library(ggplot2)
library(stringr)
library(plyr)
library(dplyr)
library(purrr)
library(tidyr)

load(file = "replication_supply_demand_r.Rdata")

##########################################################################
## Table 1 and Figure 2 ##################################################
##########################################################################

td_beta <- tidy(read)
td_gamma <- tidy(read, matrix = "gamma")

top_terms <- td_beta %>%
  arrange(beta) %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  arrange(-beta) %>%
  select(topic, term) %>% 
  reframe(terms = list(term)) %>%
  mutate(terms = map(terms, paste, collapse = ", ")) %>% 
  unnest_legacy()
top_terms

gamma_terms <- td_gamma %>%
  group_by(topic) %>%
  reframe(gamma = mean(gamma)) %>%
  arrange(desc(gamma)) %>%
  left_join(top_terms, by = "topic") %>%
  mutate(topic = paste0("Topic ", topic),
         topic = reorder(topic, gamma))
gamma_terms

## Table 1: Topics ordered by prevalence with top terms 
gamma_terms %>%
  select(topic, gamma, terms) %>%
  knitr::kable(digits = 3, 
        col.names = c("Topic number", "Expected proportion", "Top terms"))


## Figure 2: Crises Topic Prevalence Over Time 
effect <- estimateEffect(c(10, 16, 7, 14, 8, 17, 25, 28) ~ as.factor(year), read, metadata = stm2$meta)

labels_vec3 <- c("Topic 10: Eurozone Crisis", "Topic 16: Eurozone Crisis", 
                 "Topic 7: Eurozone Crisis", "Topic 14: The Covid-19 Pandemic", 
                 "Topic 8: Brexit",  "Topic 17: Rule of Law Crisis", 
                 "Topic 25: Migration Crisis", "Topic 28: Crimean Crisis")

effect <- lapply(c(1), function(i) {
  extract.estimateEffect(x = effect,
                         covariate = "year",
                         model = read,
                         method = "pointestimate",
                         labeltype = "custom",
                         custom.labels = labels_vec3
  )
})
effect <- do.call("rbind", effect)

ggplot(effect, aes(x = covariate.value, y = estimate,
                   ymin = ci.lower, ymax = ci.upper)) +
  facet_wrap(~ label, nrow = 4) +
  geom_ribbon(alpha = 1, fill = "grey70") +
  geom_line() +
  theme_gray() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x="Year", y="Topic prevalence") 


##########################################################################
## Validating # of topics ################################################
##########################################################################

## Figure A2: Model Diagnostics
## Online appendix 

k_result <- many_models %>%
  mutate(exclusivity = map(topic_model, exclusivity),
         semantic_coherence = map(topic_model, semanticCoherence, dfm_corpus_all),
         eval_heldout = map(topic_model, eval.heldout, heldout$missing),
         residual = map(topic_model, checkResiduals, dfm_corpus_all),
         bound =  map_dbl(topic_model, function(x) max(x$convergence$bound)),
         lfact = map_dbl(topic_model, function(x) lfactorial(x$settings$dim$K)),
         lbound = bound + lfact,
         iterations = map_dbl(topic_model, function(x) length(x$convergence$bound)))

k_result %>%
  transmute(K,
            `Lower bound` = lbound,
            Residuals = map_dbl(residual, "dispersion"),
            `Semantic coherence` = map_dbl(semantic_coherence, mean),
            `Held-out likelihood` = map_dbl(eval_heldout, "expected.heldout")) %>%
  tidyr::gather(Metric, Value, -K) %>%
  ggplot(aes(K, Value)) +
  theme_bw() +
  geom_line(size = 1.5, alpha = 1, show.legend = FALSE) +
  facet_wrap(~Metric, scales = "free_y") +
  labs(x = "K (number of topics)",
       y = NULL,
       title = "Model diagnostics by number of topics"
  )

k_result %>%
  select(K, exclusivity, semantic_coherence) %>%
  filter(K %in% c(10, 20, 30, 40, 50)) %>%
  tidyr::unnest(cols = c(exclusivity, semantic_coherence)) %>%
  mutate(K = as.factor(K)) %>%
  ggplot(aes(semantic_coherence, exclusivity, shape = factor(K))) +
  theme_gray() +
  geom_point(size = 2, alpha = 1) +
  labs(x = "Semantic coherence",
       y = "Exclusivity",
       title = "Comparing exclusivity and semantic coherence")

##########################################################################
## Content validation ####################################################
##########################################################################

## Figure A3: Selections of crisis text 
## Online appendix

thoughts8 <- stm::findThoughts(read,texts=corpus_all, topics=8, n=10)
stm::plotQuote(thoughts8$docs[[1]][4], main = "Topic 8: Brexit Crisis")
postscript("euro8.eps")
stm::plotQuote(thoughts8$docs[[1]][4], main = "Topic 8: Brexit Crisis")
dev.off()

thoughts14 <- findThoughts(read,texts=corpus_all, topics=14, n=10)
stm::plotQuote(thoughts14$docs[[1]][7], main = "Topic 14: Covid-19 Pandemic")
postscript("euro14.eps")
stm::plotQuote(thoughts14$docs[[1]][7], main = "Topic 14: Covid-19 Pandemic")
dev.off()

thoughts16 <- findThoughts(read,texts=corpus_all, topics=16, n=10)
stm::plotQuote(thoughts16$docs[[1]][3], main = "Topic 16: Eurozone Crisis")
postscript("euro16.eps")
stm::plotQuote(thoughts16$docs[[1]][3], main = "Topic 16: Eurozone Crisis")
dev.off()  

thoughts25 <- findThoughts(read,texts=corpus_all, topics=25, n=10)
stm::plotQuote(thoughts25$docs[[1]][9], main = "Topic 25: Migrant Crisis")
postscript("euro25b.eps")
stm::plotQuote(thoughts25$docs[[1]][9], main = "Topic 25: Migrant Crisis")
dev.off()  

thoughts28 <- findThoughts(read,texts=corpus_all, topics=28, n=10)
stm::plotQuote(thoughts28$docs[[1]][4], main = "Topic 28: Crimea Crisis")
postscript("euro28.eps")
plotQuote(thoughts28$docs[[1]][4], main = "Topic 28: Crimea Crisis")
dev.off()

thoughts17 <- findThoughts(read,texts=corpus_all, topics=17, n=10)
stm::plotQuote(thoughts17$docs[[1]][3], main = "Topic 17: Rule of Law Crisis")
postscript("euro17.eps")
plotQuote(thoughts17$docs[[1]][3], main = "Topic 17: Rule of Law Crisis")
dev.off()

thoughts10 <- findThoughts(read,texts=corpus_all, topics=10, n=15)
stm::plotQuote(thoughts10$docs[[1]][5], main = "Topic 10: Eurozone Crisis")
postscript("euro10.eps")
plotQuote(thoughts10$docs[[1]][15], main = "Topic 10: Eurozone Crisis")
dev.off()

thoughts7 <- findThoughts(read,texts=corpus_all, topics=7, n=15)
stm::plotQuote(thoughts7$docs[[1]][7], main = "Topic 7: Eurozone Crisis")
postscript("euro7.eps")
plotQuote(thoughts7$docs[[1]][7], main = "Topic 7: Eurozone Crisis")
dev.off()

## Figure A4: Topic 23 Prevalence and selected text
## Online appendix

effect2 <- estimateEffect(c(23) ~ as.factor(year), read, metadata = stm2$meta)
labels_vec4 <-c("General economic topic")

effect2 <- lapply(c(1), function(i) {
  extract.estimateEffect(x = effect2,
                         covariate = "year",
                         model = read,
                         method = "pointestimate",
                         labeltype = "custom",
                         custom.labels = labels_vec4
  )
})
effect2 <- do.call("rbind", effect2)

ggplot(effect2, aes(x = covariate.value, y = estimate,
                    ymin = ci.lower, ymax = ci.upper)) +
  geom_ribbon(alpha = 1 , fill = "grey70") +
  geom_line() +
  theme_gray() +
  theme(plot.title = element_text(hjust = 0.25)) +
  labs(x="Year", y="Topic prevalence") 

thoughts23 <- findThoughts(read,texts=corpus_all, topics=23, n=15)
stm::plotQuote(thoughts23$docs[[1]][7], main = "Topic 23: General Economic Topic")
postscript("euro23.eps")
plotQuote(thoughts23$docs[[1]][7], main = "Topic 23: General Economic Topic")
dev.off()
